# Load libraries
library(ggplot2)
library(readr)
library(dplyr)
library(viridis)
library(ggpointdensity)
library(cowplot)

# Pairwise-distance comparison tables, one per marker/system. The code below
# reads the columns dist_emboss, dist_iqtree and ST_relation from them.
st <- read_csv("distances_comparison_MLST.csv")
rTA <- read_csv("distances_comparison_RosmerTA.csv")
ssp <- read_csv("distances_comparison_SspBCDE.csv")

# Colour ramp for the point-density scale: 300 steps of viridis option "H".
# `viridis_total` appends a 100-step yellow-to-green extension.
# NOTE(review): only `viridis_colors` is used by the scatter plots in this
# file; `viridis_new` / `viridis_total` look unused -- confirm before removing.
num_colors <- 300
viridis_colors <- viridis(num_colors, option = "H")
viridis_new <- colorRampPalette(c("yellow", "#54FF9F"))(100)
viridis_total <- c(viridis_colors, viridis_new)


# Drop the pairs whose ST relation is "unknown".
st <- st[st$ST_relation != "unknown", ]
rTA <- rTA[rTA$ST_relation != "unknown", ]
# Avoid outliers: keep only SspBCDE pairs with dist_emboss <= 0.5.
# (The previous code subset an undefined object `df3` here.)
# NOTE(review): unlike st and rTA, ssp is not filtered on ST_relation --
# confirm that this is intended.
ssp <- ssp[ssp$dist_emboss <= 0.5, ]

# Names of the distance tables to plot; each one yields a "<name>_gg" plot.
df_names <- c("st","rTA","ssp")

for (df_name in df_names) {
  # Resolve the table from its variable name
  dist_table <- get(df_name)

  # Random subsample of 10% of the rows (no seed is set, so the selection
  # differs between runs)
  n_sampled <- nrow(dist_table) * 0.1
  sampled_rows <- dist_table[sample(nrow(dist_table), n_sampled), ]

  # Density-coloured scatter plot with a smoothed trend line, one facet per
  # ST relation.
  # NOTE(review): the x-axis label is "SspBCDE distance" for all three
  # tables -- confirm whether it should follow the table being plotted.
  scatter <- ggplot(
    data = sampled_rows,
    mapping = aes(x = dist_emboss, y = dist_iqtree)
  ) +
    geom_pointdensity() +
    geom_smooth(color = "darkred", se = FALSE, alpha = 0.3) +
    scale_color_gradientn(colors = viridis_colors) +
    facet_wrap(~ST_relation) +
    ylim(0, 0.125) +
    labs(y = "Phylogenetical distance", x = "SspBCDE distance") +
    theme_minimal() +
    theme(legend.position = "none")

  # Store the plot under "<table name>_gg" (st_gg, rTA_gg, ssp_gg)
  assign(paste0(df_name, "_gg"), scatter)
}

# Panel B: the three scatter plots side by side. The grid must receive the
# ggplot objects created by the loop (st_gg, rTA_gg, ssp_gg); st, rTA and ssp
# are the underlying data frames and cannot be drawn by plot_grid().
B <- plot_grid(st_gg, rTA_gg, ssp_gg, ncol = 3, nrow = 1)

# PHYLOGENY + VARIANTS
library(ggtree)
library(ggnewscale)
library(treeio)
library(ape)
library(ggplot2)
library(tidyverse)


# PHYLOGENY PART
# Read the tree and root it on the tip "ab04946".
tree <- read.tree("./iqtree_Sept2024_def.treefile")
tree <- root(tree, outgroup = "ab04946", resolve.root = TRUE)

# Variant table: tab-separated with a header; the first column supplies the
# row names, which are matched against the tree tip labels below.
variants_meta <- read.table(
  "./variant_matrix.tsv",
  sep = "\t",
  header = TRUE, # spelled out: `T` is an ordinary, reassignable variable
  row.names = 1
)
# Keep only the rows whose name is a tip of the tree
aligned_meta <- variants_meta[rownames(variants_meta) %in% tree$tip.label, ]


library(paletteer)

# Distinct variant labels across the five listed systems, in first-seen order.
variant_list <- unique(c(
  aligned_meta$PD.T4,
  aligned_meta$PD.T7,
  aligned_meta$RM,
  aligned_meta$RosmerTA,
  aligned_meta$SspBCDE
))

# Base palette for the variants, with entries 6 and 10 replaced by custom
# colours.
variant_colors <- paletteer_d("ggsci::default_igv")
variant_colors[c(6, 10)] <- c("#6BD76BFF", "#BA6338FF")


# Headerless tab-separated table; read.table() names the columns V1, V2, ...
# V1 is split by V2 below -- presumably V1 holds tip labels and V2 the ST of
# each tip (TODO confirm against the file).
mlst <- read.table(
  "./mlst_ab_freq_wored100.tsv",
  sep = "\t",
  stringsAsFactors = FALSE # full argument name; no partial matching
)

# Define the main MLST groups in the tree
st123 <- split(mlst$V1, mlst$V2)
otus <- groupOTU(tree, st123)

# Circular tree with branches coloured by MLST group. The break 0 is labelled
# "other" -- presumably the group given to tips outside every listed ST.
phylo <- ggtree(otus, aes(color = group), layout = "circular", size = 0.8) +
  geom_treescale(x = 0, y = 6100, offset = 10) +
  vexpand(1, direction = 1) +
  scale_color_manual(
    values = c(
      "black", "#E41A1C", "#377EB8", "goldenrod1", "#4DAF4A",
      "#984EA3", "#FF7F00", "#A65628", "#F781BF"
    ),
    breaks = c(0, "ST1", "ST2", "ST3", "ST10", "ST25", "ST78", "ST79", "ST499"),
    labels = c(
      "other", "ST1", "ST2", "ST3", "ST10", "ST25", "ST78", "ST79", "ST499"
    ),
    guide = "none"
  )


# BARPLOT PART
# Per-genome table with one column per defense system; an MLST column is
# also read from it when the bar charts are built.
st_variants <- read.table(
  "mlst_defense_variants.tsv",
  sep = "\t",
  header = TRUE # spelled out: `T` is an ordinary, reassignable variable
)

# Long format: one row per (genome, system) pair, with the system name in
# `Systems` and its variant label in `Variants`.
df_long <- st_variants %>%
  pivot_longer(
    cols = c("RosmerTA", "SspBCDE", "RM", "PD.T4", "PD.T7", "Cas"),
    names_to = "Systems",
    values_to = "Variants"
  )
 
# DRAW FIGURES
library(cowplot)

# Build one combined panel per system (one per column of aligned_meta):
# the circular tree with a variant heatmap ring on the left, and a stacked
# bar chart of variant percentages per MLST on the right.
plots <- list()

# Blank spacer placed above every bar chart; it does not depend on the system
empty <- ggplot() + theme_minimal()

for (sys in names(aligned_meta)) {

  # 1. Variants observed for this system, ignoring NA and the "-" placeholder
  variant_values <- unique(aligned_meta[[sys]])
  variant_values <- variant_values[
    !is.na(variant_values) & variant_values != "-"
  ]

  # One palette colour per variant. Naming the whole palette with a shorter
  # vector would leave NA names on the unused colours, so trim it first and
  # fail with a clear message when the palette is too small.
  if (length(variant_values) > length(variant_colors)) {
    stop(
      "System '", sys, "' has ", length(variant_values),
      " variants but the palette only provides ", length(variant_colors),
      " colours.",
      call. = FALSE
    )
  }
  variant_colors_per_sys <- setNames(
    variant_colors[seq_along(variant_values)],
    variant_values
  )


  # PHYLOGENY
  # Single-column data frame (row names kept) for the heatmap ring.
  # all_of() makes explicit that `sys` is an external character vector.
  variant_sys <- aligned_meta %>% select(all_of(sys))

  # Values without a colour (NA, "-") are drawn in white
  g <- gheatmap(phylo, variant_sys, color = NA, colnames = FALSE, width = 0.8) +
    scale_fill_manual(values = variant_colors_per_sys, na.value = "white") +
    labs(fill = "Variants") +
    theme(legend.position = "none")

  # Add titles
  g_labeled <- cowplot::ggdraw() +
    draw_label(sys, fontface = "bold", x = 0.5, hjust = 0.5, size = 14) +
    draw_plot(g, y = -0.05, height = 1)

  # BARPLOT

  # Percentage of each variant within every MLST for this system
  variant_per_sys <- df_long %>%
    filter(Systems == sys, MLST != "-", Variants != "-") %>%
    count(MLST, Variants) %>%
    group_by(MLST) %>%
    mutate(total = sum(n), per = n / total * 100) %>%
    ungroup()

  g_bar <- ggplot(
    variant_per_sys,
    aes(x = factor(MLST), y = per, fill = Variants)
  ) +
    geom_bar(stat = "identity") +
    labs(x = "", y = "Percentage", fill = "Variants") +
    scale_fill_manual(values = variant_colors_per_sys) +
    guides(fill = "none") +
    theme_minimal(base_size = 12) +
    theme(axis.text.x = element_text(size = 15, angle = 45, hjust = 1))

  ## 3. COMBINE PHYLOGENY AND BARPLOTS

  # Spacer on top so the bar chart sits in the lower 4/5 of its column
  bar_panel <- plot_grid(
    empty, g_bar,
    nrow = 2,
    rel_heights = c(1, 4)
  )

  full_panel <- plot_grid(
    g_labeled + ggtitle(sys),
    bar_panel,
    ncol = 2,
    rel_widths = c(5, 2)
  )

  plots[[sys]] <- full_panel

}
# Combine all the per-system panels into panel A (3 columns x 2 rows).
# NOTE(review): the labels are positional; they assume `plots` holds exactly
# six panels in this order (i.e. the column order of aligned_meta) -- confirm.
A <- plot_grid(
  plotlist = plots,
  ncol = 3,
  nrow = 2,
  labels = c("Cas", "PD-T4-5", "PD-T7-5", "R-M", "RosmerTA", "SspBCDE"),
  label_size = 25
)

# Save figures
pdf("fig2.pdf", width = 30, height = 25, paper = "special")
# Explicit print(): a ggplot object is only auto-printed at top level, so
# without it the PDF comes out empty when the script is run via source().
print(plot_grid(A, B, nrow = 2, labels = c("A", "B"), rel_heights = c(3, 2)))
dev.off()

